Notes: look for sex-antagonism or specifically, try and identify SNPs that affect one sex in one direction and the other sex in the opposite direction.
The preparation of data is adapted from Holman and Wong’s DGRP GWAS of fitness in different contexts. See their associated workflowrreport for a comprehensive breakdown of their analysis.
Load packages
Code
library(tidyverse) # tidy coding, ggplot etc
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.3 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.4 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.0
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
library(glue) # for coding within stringslibrary(bigsnpr) # to install: devtools::install_github("privefl/bigsnpr")
Loading required package: bigstatsr
Code
library(pander) # for tableslibrary(future.apply) # for parallel code running
Loading required package: future
Code
library(brms) # for bayesian models
Loading required package: Rcpp
Loading 'brms' package (version 2.20.4). Useful instructions
can be found by typing help('brms'). A more detailed introduction
to the package is available through vignette('brms_overview').
Attaching package: 'brms'
The following object is masked from 'package:stats':
ar
Code
library(ggtext) # for markdown syntax in ggplotlibrary(MetBrewer) # for more colour paletteslibrary(patchwork) # building composite plotslibrary(DT) # for nice tableslibrary(ggrepel)library(pheatmap)# build a helper function that produces a table to display our data# Create a function to build HTML searchable tablesmy_data_table <-function(df){datatable( df, rownames=FALSE,autoHideNavigation =TRUE,extensions =c("Scroller", "Buttons"),options =list(autoWidth =TRUE,dom ='Bfrtip',deferRender=TRUE,scrollX=TRUE, scrollY=1000,scrollCollapse=TRUE,buttons =list('pageLength', 'colvis', 'csv', list(extend ='pdf',pageSize ='A4',orientation ='landscape',filename ='Lifespan_data')),pageLength =100 ) )}
Load in DGRP variant annotations
The annotation text file was downloaded from the DGRP website: .
We use the site class of each variant for the downstream mutational burden analysis.
Code
options(future.globals.maxSize =2000*1024^2, stringsAsFactors =FALSE)# Helper function to split a vector into chunks chunker <-function(x, max_chunk_size) split(x, ceiling(seq_along(x) / max_chunk_size))get_variant_annotations <-function(){# Load up the big annotation file, get pertinent info. It's stored in some sort of text string format annot <-read.table("data/Input/dgrp.fb557.annot.txt", header =FALSE, stringsAsFactors =FALSE) get.info <-function(rows){lapply(rows, function(row){ site.class.field <-strsplit(annot$V3[row], split ="]")[[1]][1] num.genes <-str_count(site.class.field, ";") +1 output <-cbind(rep(annot$V1[row], num.genes), do.call("rbind", lapply(strsplit(site.class.field, split =";")[[1]], function(x) strsplit(x, split ="[|]")[[1]])))if(ncol(output) ==5) return(output[,c(1,2,4,5)]) # only return SNPs that have some annotation. Don't get the gene symbolelsereturn(NULL) }) %>%do.call("rbind", .) }plan("multisession") variant.details <-future_lapply(chunker(1:nrow(annot), max_chunk_size =10000), get.info) %>%do.call("rbind", .) %>%as.data.frame()names(variant.details) <-c("SNP", "FBID", "site.class", "distance.to.gene") variant.details$FBID <-unlist(str_extract_all(variant.details$FBID, "FBgn[:digit:]+")) # clean up text strings for Flybase ID variant.details %>% dplyr::filter(site.class !="FBgn0003638") %>%# NB this is a bug in the DGRP's annotation filemutate(chr =str_remove_all(substr(SNP, 1, 2), "_")) # get chromosome now for faster sorting later}if(!file.exists("data/Derived/annotations.csv")){get_variant_annotations() %>%write_csv("data/Derived/annotations.csv")} else annotations <-read_csv("data/Derived/annotations.csv")
Rows: 4622971 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): SNP, FBID, site.class, chr
dbl (1): distance.to.gene
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Building the dataset
Here’s the raw data from each study. This was then passed here and later returned…
Code
# Load genotyped lines reported in Huang et al for cross-matching and filteringgenotyped_lines <-read_csv("data/Input/Genotyped_lines.csv") %>%mutate(line =as.factor(line),Genotyped ="YES")
Rows: 205 Columns: 1
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (1): line
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 13438 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sex, Study, Mated
dbl (7): line, Vial_ID, Lifespan, Block, Temperature, Density, Prop_female
lgl (1): Diet
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Arya_lines <- Arya_raw_data %>%distinct(line) %>%inner_join(genotyped_lines) %>%nrow() # differs from Ivanov because they collected a little bit more data in that study and combined with Ivanov. Those data were not emailed.
Rows: 70209 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sex, Vial_ID2, Mated
dbl (8): Line, Vial_ID, Lifespan, Block, Temperature, Prop_female, Density, ...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Huang_lines <- Huang_raw_data %>%distinct(line) %>%inner_join(genotyped_lines) %>%nrow() # perfect match with study
Rows: 67605 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sex, Study, Mated
dbl (8): Line, Lifespan, Week, Replicate, Diet, Temperature, Prop_female, De...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 4132 Columns: 18
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (8): Week_1_fecundity, Week_3_fecundity, Week_5_fecundity, Week_7_fecun...
dbl (10): Fly_ID, Block, Diet, Line, Thorax_length, Lifespan, Lifetime_fecun...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 3860 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): Line, Study, Sex, Mated
dbl (5): Lifespan, Block, Temperature, Prop_female, Density
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 3860 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): Line, Study, Sex, Mated
dbl (5): Lifespan, Block, Temperature, Prop_female, Density
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Joining with `by = join_by(Line)`
Rows: 15397 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sex, Mated, Study
dbl (9): Line, Block, Lifespan, Replicate, Prop_female, Temperature, Diet, D...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 8478 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sex, Study, Mated
dbl (8): Line, Lifespan, Vial, Block, Diet, Temperature, Prop_female, Density
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Zhao_raw_data <-read_csv("data/Input/Raw_data/tidy_Zhao_2022.csv") %>%rename(Vial_ID = Vial) %>%mutate(line =as.factor(Line),Sex =as.factor(Sex),Vial_ID =as.factor(Vial_ID),Block =as.factor(Block),Diet =as.numeric(10),Adult_age_at_start =0, # they were 3 days old upon entrance, but this is already accounted for in the death day measureSocial_group_size = Density,Density = Density /47,Death_day = Lifespan,Lifespan = Death_day + Adult_age_at_start) %>%select(line, Sex, Vial_ID, Lifespan, Adult_age_at_start, Death_day, Block, Study, Temperature, Diet, Social_group_size, Density, Mated, Prop_female)
Rows: 2023 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): Sex, Study, Mated, Diet
dbl (7): Block, Line, Lifespan, Vial, Temperature, Density, Prop_female
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# combine them into a single tibbleAll_raw_data <-bind_rows(Arya_raw_data, Huang_raw_data, Wilson_raw_data, Durham_raw_data, Patel_raw_data, Dick_raw_data, Hoffman_raw_data, Zhao_raw_data) %>%filter(!is.na(Lifespan)) %>%rename(Yeast_percentage = Diet) %>%left_join(genotyped_lines, by ="line") %>%mutate(Genotyped =if_else(is.na(Genotyped), "NO", Genotyped),Density =round(Density, 3))# write_csv(All_raw_data, file ="data/Input/all_raw_data.csv")#my_data_table(All_raw_data)
Column explanations
line: the DGRP line (genotype)
Sex: is the fly a female or a male?
Vial_ID: in many of the studies, multiple flies were kept together in the same environment. It is common practice to record which flies shared the same environment, because a common environment can explain why some flies might exhibit similar lifespan e.g. if humidity is slightly lower in one environment than another, an increased risk of desiccation is shared by all flies in that environment. It is called Vial_ID because the most common way to house Drosophila is in cylindrical vials.
Lifespan: the number of days the fly survived as an adult.
Adult_age_at_start: often, a few days elapsed before adult flies entered the lifespan assay. Thus, for comparison across studies, it is important to add these days to the Death_day count to find the true adult lifespan.
Death_day: the number of days the fly survived as an adult, that were tracked by the lifespan assay.
Block: because of the large number of DGRP lines, often studies split their experiment into separate blocks, each of which assays a fraction of the total number of lines. These blocks are temporally separated, and generally deal with different generations of flies. Note that blocks are rarely balanced with respect to line e.g. all the sampling of line 10 is done in block 1.
Study: the study that first reports the data.
Temperature: the temperature, in degrees Celsius, that the lifespan assay was completed at.
Yeast_percentage: the overall % of the food recipe made up by brewers yeast. This is a common ingredient in Drosophila food, and the key source of protein. Several studies in our dataset were specifically interested in restriction of protein availability as a method to extend lifespan.
Social_group_size: the number of flies housed together in a vial (or bottle), at the onset of the fitness assay. As individuals died, social group sizes decreased.
Density: the number of flies per cm^3 of available space. Calculated as Social_group_size / 47cm^3 for Drosophila vials and Social_group_size / 150cm^3 for Drosophila bottles.
Mated: across our identified studies, three mating states were used in the lifespan assays. Flies were either kept as virgins for their entire lives (Mated = NO), were allowed to mate for a short period prior to the lifespan assay (Mated = Prior to assay), or kept in mixed sex groups and thus allowed to mate throughout their lives.
Prop_female: the proportion of females kept within each environment.
Genotyped: indicates whether the DGRP line has sequence data available. Most do, but a few lines included in the early studies (pre-2012) did not end up getting genotyped.
Genetic analysis of life expectancy and lifespan inequality
In part 1 of this study, we calculated mean values and standard error for each combination of line, sex, temperature and mating status. These data are displayed, and can be downloaded from, the below table. Note that for GWA and other SNP based analysis, we removed lines that had not been genotyped (shown as Genotyped = NO).
Rows: 2815 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): Sex, Mated, Study, Density_cat, best_mod
dbl (12): Line, Temperature, Block, e0, SE_e0, h, SE_h, samp, a, b, a_SE, b_SE
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
my_data_table(full_dataset)
For CPASSOC, we split the data into their respective treatments. For the purposes of this analysis, we follow a common convention in quantitative genetics and consider each lifespan measurement a separate trait e.g. lifespan at 25 degrees is a separate trait from lifespan at 18 degrees.
Code
Arya_2010_f <-full_dataset %>%filter(Study =="Arya_2010"& Sex =="Female"& Genotyped =="YES")Arya_2010_m <-full_dataset %>%filter(Study =="Arya_2010"& Sex =="Male"& Genotyped =="YES")Huang_2020_f_18 <- full_dataset %>%filter(Study =="Huang_2020"& Sex =="Female"& Temperature ==18)Huang_2020_m_18 <- full_dataset %>%filter(Study =="Huang_2020"& Sex =="Male"& Temperature ==18)Huang_2020_f_25 <- full_dataset %>%filter(Study =="Huang_2020"& Sex =="Female"& Temperature ==25)Huang_2020_m_25 <- full_dataset %>%filter(Study =="Huang_2020"& Sex =="Male"& Temperature ==25)Huang_2020_f_28 <- full_dataset %>%filter(Study =="Huang_2020"& Sex =="Female"& Temperature ==28)Huang_2020_m_28 <- full_dataset %>%filter(Study =="Huang_2020"& Sex =="Male"& Temperature ==28)Wilson_2020_f <- full_dataset %>%filter(Study =="Wilson_2020") %>%distinct(line, .keep_all =TRUE)Durham_2014_f <- full_dataset %>%filter(Study =="Durham_2014") %>%distinct(line, .keep_all = T)Patel_2021_f <- full_dataset %>%filter(Study =="Patel_2021")
Finally, we also load the residual line means for each sex, which are the line means after controlling for temperature, mating status and the Study_ID random effect.
Rows: 659 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Sex
dbl (3): line, res_e0, res_h
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Joining with `by = join_by(line)`
Performing GWAS
Install neccessary software and build helper functions
In addition to the R packages we load, plink 1.9 and beagle must also be installed. These software packages allow us to wrangle the data into the correct format and impute missing values, both of which are required to run GWAS with the Gemma R package.
Code
# build a function to prepare data for GWASprep_for_e0_GWAS <-function(data, sex){data %>%#inner_join(read_csv("data/Input/Genotyped_lines.csv"), by = "line") %>% # filter to lines that have been genotypedmutate(line =glue("line{line}")) %>%select(line, e0)}prep_for_h_GWAS <-function(data, sex){data %>%#inner_join(read_csv("data/Input/Genotyped_lines.csv"), by = "line") %>% # filter to lines that have been genotypedmutate(line =glue("line{line}")) %>%select(line, h)}# I used bigsnpr::download_plink(dir = "code") which puts it in the code folder - I'm using a windows operating system.# Beagle is a software package for phasing genotypes and imputing ungenotyped markers.plink <-paste(getwd(), "code/plink", sep ="/")beagle <- bigsnpr::download_beagle()# # Load the wolbachia infection status data from the DGRP website# wolbachia <- read_csv("data/input/wolbachia.csv") %>% # mutate(line = paste("line", line, sep = ""))# # # Load the chomosomal inversion data from the DGRP website# # these are the 5 inversions that Huang et al. PNAS corrected for# inversions <- read_csv("data/input/inversion genotypes.csv") %>%# mutate(line = paste("line", line, sep = "")) %>%# select(line, `In(2L)t`, `In(2R)NS`, `In(3R)P`, `In(3R)K`, `In(3R)Mo`) # helper function to pass commands to the terminal# Note that we set `intern = TRUE`, and pass the result of `system()` to `cat()`, # ensuring that the Terminal output will be printed in this knitr report.# # This is the mac OS function#run_command <- function(shell_command, wd = getwd(), path = ""){# cat(system(glue("cd ", wd, path, # tell terminal which directory to work in# "\n",shell_command), # on a second terminal line, run the plink command# intern = TRUE), sep = '\n') #}# This is the windows function run_command <-function(plink_command, wd =getwd(), path ="") {# Specify the full path to the plink executable within the 'code' subdirectory. plink_path <-paste(getwd(), "code/plink", sep ="/")# Create the full shell command with the plink executable. command <-glue("cmd.exe /c cd /d {shQuote(file.path(wd, path))} && {shQuote(plink_path)} {plink_command}")# Execute the combined command. result <-system(command, intern =TRUE)# Print the result.cat(result, sep ='\n')# Return the result as a character vector.return(result)}# sometimes we need to run terminal commands without plink - create a slightly different function to do thisrun_command_no_plink <-function(shell_command, wd =getwd(), path ="") {# Create the full shell command with the plink executable. command <-glue("cmd.exe /c cd /d {shQuote(file.path(wd, path))} && {shell_command}")# Execute the combined command. result <-system(command, intern =TRUE)# Print the result.cat(result, sep ='\n')# Return the result as a character vector.return(result)}
Build a function to create manhattan plots for later
Code
build_manhattan_plot <-function(gwas_results){ manhattan_data <- gwas_results %>%mutate(position =str_split(SNP, "_"), # split the SNP name into logical bitschr =map_chr(position, ~ .x[1]), # the first bit is the chromosome arm - name the column appropriatelyposition =as.numeric(map_chr(position, ~ .x[2])), # where on the chromosome is the SNP foundpval =-1*log10(P)) %>%# make visualising the p values easierselect(chr, position, pval)# this next chunk finds convenient cuts for labels and colour changes max_pos <- manhattan_data %>%group_by(chr) %>%summarise(max_pos =max(position), middle_pos = (max_pos -min(position)) /2,.groups ="drop") %>%as.data.frame() max_pos$max_pos <-c(0, cumsum(max_pos$max_pos[1:5])) max_pos$label_pos <- max_pos$max_pos + max_pos$middle_pos# combine the two dataframes created above manhattan_data <- manhattan_data %>%left_join(max_pos, by ="chr") %>%mutate(position = position + max_pos,chromosome_colour =case_when(chr =="2L"| chr =="3L"| chr =="4"~"A",.default ="B"),Notable =if_else(pval >=8, "YES", "NO")) plot <- manhattan_data %>%#filter(Notable == "NO") %>% ggplot(aes(position, pval, group = chr, stroke =0.01)) +geom_point(aes(colour = chromosome_colour), size =1.6) +geom_hline(yintercept =8, linetype =2, colour ="red", linewidth =1.2) +scale_colour_manual(values =c(met.brewer(name ="Hokusai3")[3], met.brewer(name ="Hokusai3")[6])) +#geom_label_repel(data = manhattan_data %>% filter(common_SNP == "YES" &), # aes(label = position, fill = "aliceblue")) +#geom_point(data = manhattan_data %>% filter(Notable == "YES"),# fill = "aliceblue", size = 3, # shape = 21, colour = "grey2") +scale_x_continuous(breaks = max_pos$label_pos, labels = max_pos$chr) +scale_y_continuous(expand =c(0,0), limits =c(0, 40)) +ylab("-log~10~(_p_)") +xlab("Chromosome and position") +theme_classic() +theme(legend.position ="none",axis.title.y =element_markdown(size =18),axis.title.x =element_markdown(size =18),axis.text.x =element_text(size =15),axis.text.y =element_text(size =15)) }
Perform SNP quality control and impute missing data
We cleaned up the DGRP’s .bed/.bim/.fam files (available from the Mackay lab website) by:
Removing any SNPs for which genotypes are missing for >10% of the DGRP lines. We then use the software Beagle to impute the remaining missing genotypes. Imputation takes a long time so ideally, you only want to do it once.
Removing SNPs with a minor allele frequency of less than 5%. We have negilible power to detect associations for rare SNPs below this threshold.
In the PLINK-formatted genotype files, lines fixed for the major allele are coded as 2, and lines fixed for the minor allele as 0. This means that in the association tests we calculate, negative effect sizes mean that the minor allele is associated with lower trait values, while positive effect sizes means that the minor allele is associated with higher trait values.
Code
Run_function <-FALSE# Change this to TRUE to run the models# If the imputation is not already done, create the following 3 files of imputed genotype data: # dgrp2_QC_all_lines_imputed_correct.bed/bim/famif(!file.exists("data/Derived/dgrp2_QC_all_lines_imputed_correct.bed")) {perform_SNP_QC_and_imputation(phenotypes = predicted_line_means)}if(Run_function){# Use Plink to clean and subset the DGRP's SNP data as follows:# Only keep SNPs for which at least 90% of DGRP lines were successfully genotyped (--geno 0.1)# Only keep SNPs with a minor allele frequency of 0.05 or higher (--maf 0.05)# Finally, write the processed BIM/BED/FAM files to the data/derived directory output_directory <-paste(getwd(), "data/Derived", sep ="/")run_command(glue("--bfile dgrp2"," --geno 0.1 --maf 0.05 --allow-no-sex", " --make-bed --out {shQuote(output_directory)}/dgrp2_QC_all_lines"), path ="data/Input/")# Use the shell command 'sed' to remove underscores from the DGRP line names in the .fam file (e.g. 'line_120' becomes 'line120')# Otherwise, these underscores cause trouble when we need to convert from PLINK to vcf format (vcf format uses underscore as a separator)# I manually edited the text file to do this rather than using the below command, which I don't have quite working#for(i in 1:2) run_command_no_plink("sed -i '' 's/_//' dgrp2_QC_all_lines.fam", path = "/data/Derived/")# Now impute the missing genotypes using Beagle# This part uses the data for the full DGRP panel of >200 lines, to infer missing genotypes as accurately as possible. # This step uses a lot of memory (max memory was set to 28MB in prior analysis, and it used 26.5GB), but maybe it can also run on a less powerful computer?# The bigsnpr package provides a helpful wrapper for Beagle called snp_beagleImpute(): it translates to a VCF file and back again using PLINKsnp_beagleImpute(beagle, plink, bedfile.in ="data/Derived/dgrp2_QC_all_lines.bed", bedfile.out ="data/Derived/dgrp2_QC_all_lines_imputed.bed",ncores =4, memory.max =16)# assign a sex of 'female' to all the DGRP lines (Beagle removes the sex, and it seems PLINK needs individuals to have a sex, # despite PLINK having a command called --allow-no-sex)run_command("sed -i '' 's/ 0 0 0/ 0 0 2/' dgrp2_QC_all_lines_imputed.fam", path ="/data/Derived/")# Re-write the .bed file, to make sure the MAF threshold and minor/major allele designations are correctly assigned post-Beaglerun_command(glue("--bfile dgrp2_QC_all_lines_imputed"," --geno 0.1 --maf 0.05 --allow-no-sex", " --make-bed --out dgrp2_QC_all_lines_imputed_correct"), path ="/data/Derived/")#unlink(list.files("data/derived", pattern = "~", full.names = TRUE)) # delete the original files, which were given a ~ file name by PLINK} else"Already run"
[1] "Already run"
Create a reduced list of LD-pruned SNPs with PLINK
To keep the computation time and memory usage manageable, we do not save the effect sizes for every variant (SNPs and indels) that passed the MAF and missingness quality control step above (1,646,672 variants). Instead, we save the effect sizes for a subset of variants that were approximately in linkage disequilibrium. We identified this LD-pruned set of variants using the PLINK arguments --indep-pairwise 100 10 0.2, which prunes within 100kB sliding windows, sliding 10 variants along with each step, and allows a maximum pairwise \(r^2\) threshold of 0.2 between loci. With these parameters, 1,419,902 variants were removed, leaving 226,770 for downstream analysis. Subsequent inspection of the Manhattan plots suggests that this method removes most (but not all) variants that are in complete or high LD.
Code
# indep-pairwise arguments are: # 100kB window size, # variant count to shift the window by 10 variants at the end of each step, # pairwise r^2 threshold of 0.2run_command(glue("--bfile dgrp2_QC_all_lines_imputed_correct"," --indep-pairwise 100 10 0.2"), path ="data/Derived/")
[1] "PLINK v1.90b7 64-bit (16 Jan 2023) www.cog-genomics.org/plink/1.9/"
[2] "(C) 2005-2023 Shaun Purcell, Christopher Chang GNU General Public License v3"
[3] "Logging to plink.log."
[4] "Options in effect:"
[5] " --bfile dgrp2_QC_all_lines_imputed_correct"
[6] " --indep-pairwise 100 10 0.2"
[7] ""
[8] "15779 MB RAM detected; reserving 7889 MB for main workspace."
[9] "1646672 variants loaded from .bim file."
[10] "205 people (0 males, 205 females) loaded from .fam."
[11] "Using 1 thread (no multithreaded calculations invoked)."
[12] "Before main variant filters, 205 founders and 0 nonfounders present."
[13] "Calculating allele frequencies... 0%\b\b1%\b\b2%\b\b3%\b\b4%\b\b5%\b\b6%\b\b7%\b\b8%\b\b9%\b\b10%\b\b\b11%\b\b\b12%\b\b\b13%\b\b\b14%\b\b\b15%\b\b\b16%\b\b\b17%\b\b\b18%\b\b\b19%\b\b\b20%\b\b\b21%\b\b\b22%\b\b\b23%\b\b\b24%\b\b\b25%\b\b\b26%\b\b\b27%\b\b\b28%\b\b\b29%\b\b\b30%\b\b\b31%\b\b\b32%\b\b\b33%\b\b\b34%\b\b\b35%\b\b\b36%\b\b\b37%\b\b\b38%\b\b\b39%\b\b\b40%\b\b\b41%\b\b\b42%\b\b\b43%\b\b\b44%\b\b\b45%\b\b\b46%\b\b\b47%\b\b\b48%\b\b\b49%\b\b\b50%\b\b\b51%\b\b\b52%\b\b\b53%\b\b\b54%\b\b\b55%\b\b\b56%\b\b\b57%\b\b\b58%\b\b\b59%\b\b\b60%\b\b\b61%\b\b\b62%\b\b\b63%\b\b\b64%\b\b\b65%\b\b\b66%\b\b\b67%\b\b\b68%\b\b\b69%\b\b\b70%\b\b\b71%\b\b\b72%\b\b\b73%\b\b\b74%\b\b\b75%\b\b\b76%\b\b\b77%\b\b\b78%\b\b\b79%\b\b\b80%\b\b\b81%\b\b\b82%\b\b\b83%\b\b\b84%\b\b\b85%\b\b\b86%\b\b\b87%\b\b\b88%\b\b\b89%\b\b\b90%\b\b\b91%\b\b\b92%\b\b\b93%\b\b\b94%\b\b\b95%\b\b\b96%\b\b\b97%\b\b\b98%\b\b\b99%\b\b\b\b done."
[14] "Total genotyping rate is exactly 1."
[15] "1646672 variants and 205 people pass filters and QC."
[16] "Note: No phenotypes present."
[17] "\r1%\r2%\r3%\r4%\r5%\r6%\r7%\r8%\r9%\r10%\r11%\r12%\r13%\r14%\r15%\r16%\r17%\r18%\r19%\r20%\r21%\r22%\r23%\r24%\r25%\r26%\r27%\r28%\r29%\r30%\r31%\r32%\r33%\r34%\r35%\r36%\r36%\r37%\r38%\r39%\r40%\r41%\r41%\r42%\r43%\r44%\r45%\r46%\r46%\r47%\r48%\r49%\r50%\r51%\r51%\r52%\r53%\r54%\r55%\r56%\r56%\r57%\r58%\r59%\r60%\r61%\r61%\r62%\r63%\r64%\r65%\r66%\r67%\r68%\r69%\r70%\r71%\r72%\r73%\r74%\r75%\r76%\r77%\r78%\r79%\r80%\r81%\r82%\r83%\r84%\r85%\r86%\r87%\r88%\r89%\r90%\r91%\r92%\r93%\r94%\r95%\r96%\r97%\r98%\r99%\rPruned 309454 variants from chromosome 1, leaving 49143."
[18] "\r1%\r2%\r3%\r4%\r5%\r6%\r7%\r8%\r9%\r10%\r10%\r11%\r12%\r13%\r14%\r15%\r16%\r16%\r17%\r18%\r19%\r20%\r21%\r22%\r22%\r23%\r24%\r25%\r26%\r27%\r28%\r29%\r30%\r31%\r32%\r33%\r34%\r35%\r36%\r37%\r38%\r39%\r39%\r40%\r41%\r42%\r43%\r44%\r45%\r45%\r46%\r47%\r48%\r49%\r50%\r51%\r51%\r52%\r53%\r54%\r55%\r56%\r57%\r58%\r59%\r60%\r61%\r62%\r63%\r64%\r65%\r66%\r67%\r68%\r68%\r69%\r70%\r71%\r72%\r73%\r74%\r74%\r75%\r76%\r77%\r78%\r79%\r80%\r81%\r82%\r83%\r84%\r85%\r86%\r87%\r88%\r89%\r90%\r91%\r91%\r92%\r93%\r94%\r95%\r96%\r97%\r97%\r98%\r99%\rPruned 280344 variants from chromosome 2, leaving 45483."
[19] "\r0%\r1%\r2%\r3%\r4%\r5%\r6%\r7%\r8%\r9%\r10%\r11%\r12%\r13%\r14%\r15%\r16%\r17%\r18%\r19%\r19%\r20%\r20%\r21%\r22%\r23%\r24%\r25%\r26%\r27%\r28%\r29%\r30%\r31%\r32%\r33%\r34%\r35%\r36%\r37%\r38%\r38%\r39%\r39%\r40%\r41%\r42%\r43%\r44%\r45%\r46%\r47%\r48%\r49%\r50%\r51%\r52%\r53%\r54%\r55%\r56%\r57%\r57%\r58%\r58%\r59%\r60%\r61%\r62%\r63%\r64%\r65%\r66%\r67%\r68%\r69%\r70%\r71%\r72%\r73%\r74%\r75%\r76%\r76%\r77%\r77%\r78%\r79%\r80%\r81%\r82%\r83%\r84%\r85%\r86%\r87%\r88%\r89%\r90%\r91%\r92%\r93%\r94%\r95%\r96%\r96%\r97%\r97%\r98%\r99%\rPruned 340105 variants from chromosome 3, leaving 51947."
[20] "\r0%\r1%\r2%\r3%\r4%\r5%\r6%\r7%\r8%\r9%\r10%\r11%\r12%\r12%\r13%\r14%\r15%\r16%\r17%\r18%\r19%\r20%\r21%\r22%\r23%\r24%\r24%\r25%\r26%\r27%\r28%\r29%\r30%\r31%\r32%\r33%\r34%\r35%\r36%\r36%\r37%\r38%\r39%\r40%\r41%\r42%\r43%\r44%\r45%\r46%\r47%\r48%\r48%\r49%\r50%\r51%\r52%\r53%\r54%\r55%\r56%\r57%\r58%\r59%\r60%\r60%\r61%\r61%\r62%\r63%\r64%\r65%\r66%\r67%\r68%\r69%\r70%\r71%\r72%\r73%\r73%\r74%\r75%\r76%\r77%\r78%\r79%\r80%\r81%\r82%\r83%\r84%\r85%\r85%\r86%\r87%\r88%\r89%\r90%\r91%\r92%\r93%\r94%\r95%\r96%\r97%\r97%\r98%\r99%\rPruned 284134 variants from chromosome 4, leaving 42948."
[21] "\r1%\r2%\r3%\r4%\r5%\r6%\r6%\r7%\r8%\r9%\r10%\r11%\r12%\r13%\r13%\r14%\r15%\r16%\r17%\r18%\r19%\r20%\r20%\r21%\r22%\r23%\r24%\r25%\r26%\r27%\r27%\r28%\r29%\r30%\r31%\r32%\r33%\r34%\r34%\r35%\r36%\r37%\r38%\r39%\r40%\r41%\r42%\r43%\r44%\r45%\r46%\r47%\r48%\r49%\r50%\r51%\r52%\r53%\r54%\r55%\r56%\r57%\r58%\r59%\r60%\r60%\r61%\r62%\r63%\r64%\r65%\r66%\r67%\r67%\r68%\r69%\r70%\r71%\r72%\r73%\r74%\r74%\r75%\r76%\r77%\r78%\r79%\r80%\r81%\r81%\r82%\r83%\r84%\r85%\r86%\r87%\r88%\r88%\r89%\r90%\r91%\r92%\r93%\r94%\r95%\r96%\r97%\r98%\r99%\rPruned 203413 variants from chromosome 5, leaving 37161."
[22] "\r1%\r1%\r2%\r3%\r4%\r5%\r6%\r7%\r8%\r9%\r10%\r11%\r12%\r12%\r13%\r14%\r15%\r16%\r17%\r18%\r19%\r20%\r21%\r22%\r23%\r24%\r25%\r25%\r26%\r27%\r28%\r29%\r30%\r31%\r32%\r33%\r34%\r35%\r36%\r37%\r38%\r38%\r39%\r40%\r41%\r42%\r43%\r44%\r45%\r46%\r47%\r48%\r49%\r50%\r51%\r51%\r52%\r53%\r54%\r55%\r56%\r57%\r58%\r59%\r60%\r61%\r62%\r62%\r63%\r64%\r65%\r66%\r67%\r68%\r69%\r70%\r71%\r72%\r73%\r74%\r75%\r75%\r76%\r77%\r78%\r79%\r80%\r81%\r82%\r83%\r84%\r85%\r86%\r87%\r88%\r88%\r89%\r90%\r91%\r92%\r93%\r94%\r95%\r96%\r97%\r98%\r99%\rPruned 2452 variants from chromosome 6, leaving 88."
[23] "Pruning complete. 1419902 of 1646672 variants removed."
[24] "Writing...\rMarker lists written to plink.prune.in and plink.prune.out ."
Code
unlink("data/Derived/plink.prune.out")
Build GWAS function
Note: because PLINK defines the minor allele as the alt allele (so, lines fixed for the minor allele are scored as genotype: 2, and those with the major allele as genotype: 0), a positive effect size in these association tests means the minor allele is associated with a higher value of the trait in question.
Code
run_GWAS <-function(phenotypes){# Make a list of the lines in our sample and save as a text file for passing to PLINK lines_to_keep <- phenotypes %>%select(line) %>%mutate(line_2 = line)write.table(lines_to_keep, row.names =FALSE, col.names =FALSE, file ="data/Derived/lines_to_keep.txt", quote =FALSE)# Now cull the PLINK files to just the lines that we measured, and re-apply the # MAF cut-off of 0.05 for the new smaller sample of DGRP linesrun_command(glue("--bfile dgrp2_QC_all_lines_imputed_correct"," --keep-allele-order", # force PLINK to retain the major/minor allele designations that apply to the DGRP as a whole" --keep lines_to_keep.txt --geno 0.1 --maf 0.05", " --make-bed --out dgrp2_QC_focal_lines"), path ="/data/Derived/")# Define a function to add our phenotype data to a .fam file, which is needed for GWAS analysis and to make sure PLINK includes these samples# The 'phenotypes' data frame needs to have a column called 'line' add_phenotypes_to_fam <-function(filepath, phenotypes){read_delim(filepath, col_names =FALSE, delim =" ") %>%select(X1, X2, X3, X4, X5) %>%# Get all the non-phenotype columnsleft_join(phenotypes, by =c("X1"="line")) %>%write.table(file ="data/Derived/dgrp2_QC_focal_lines_NEW.fam", col.names =FALSE, row.names =FALSE, quote =FALSE, sep =" ")unlink("data/Derived/dgrp2_QC_focal_lines.fam")file.rename("data/Derived/dgrp2_QC_focal_lines_NEW.fam", "data/Derived/dgrp2_QC_focal_lines.fam") }# edit the new FAM file to add the phenotype data from 'phenotypes'add_phenotypes_to_fam("data/Derived/dgrp2_QC_focal_lines.fam", phenotypes)# Run mixed-model GWAS (in practice, the relatedness between almost all pairs of lines # is sufficiently low that PLINK always instead chooses to run a linear model)run_command("--bfile dgrp2_QC_focal_lines --assoc --maf 0.05", path ="/data/Derived")# wrangle the GWAS results Focal_name <-deparse(substitute(phenotypes)) gwas_results <-read.table("data/Derived/plink.qassoc", header =TRUE) %>%select(SNP, BETA, SE, "T", P)# Save a file containing all SNPs with P < 1e-5: gwas_results %>%filter(P <1e-05) %>%write_tsv(glue("data/Derived/GWAS_results/{Focal_name}_significant_SNPs.tsv.gz"))# Rename and compress the GWAS summary stats file # The filter step means that only variants in the LD-pruned subset get saved to disk. gwas_results %>%filter(SNP %in% (pull(read_tsv("data/Derived/plink.prune.in", col_names ="SNP"), SNP))) %>%write_tsv(glue("data/Derived/GWAS_results/{Focal_name}.tsv.gz"))unlink("data/Derived/plink.qassoc")# Rename the plink log filefile.rename("data/Derived/plink.log",glue("data/Derived/{Focal_name}_log.txt"))unlink("data/Derived/dgrp2_QC_focal_lines.bim")unlink("data/Derived/dgrp2_QC_focal_lines.bed")unlink("data/Derived/dgrp2_QC_focal_lines.fam")unlink("data/Derived/dgrp2_QC_focal_lines.log")}
# Extract and save the MAFs, SNP positions, and major/minor allelesMAFs <-read.table("data/derived/plink.frq", header =TRUE, stringsAsFactors =FALSE) %>%mutate(position =map_chr(strsplit(SNP, split ="_"), function(x) x[2])) %>%select(SNP, position, MAF, A1, A2) %>%rename(minor_allele = A1,major_allele = A2) MAFs <- annotations %>%select(SNP, FBID, site.class, distance.to.gene, chr) %>%collect() %>%full_join(MAFs, by ="SNP") %>%filter(!is.na(MAF)) %>%mutate(site.class =replace(site.class, is.na(site.class), "INTERGENIC")) %>%as_tibble()# Delete the original variant annotation table from the db, and add the new one back in
Applying cross-phenotype meta-analysis
The power to detect variants associated with correlated phenotypes can be increased if a meta-analytic approach is adopted (Zhu et al, 2018). Here, we use the CPASSOC approach developed by Zhou et al (2015), which evaluates the null hypothesis that SNPs are associated with none of the considered traits, weighted by the sample size of each study and adjusted for the trait correlation matrix. The steps to apply CPMA are as follows:
Estimate \(R\), the trait correlation matrix. In the DGRP, this can be done using SNP data or using line means.
GWAS each trait separately (see above).
Collate effect sizes for each trait into a vector \(\beta\), for each SNP.
Use a Wald test statistic to estimate a vector of p-values, \(T\), from the \(\beta\) and \(\sigma\) estimates for each SNP.
Test whether \(\beta\) = 0. If the trait data are homogeneous, use:
\[S_{Hom} = \frac{e^T(RW)^{-1}T(e^T(RW)^{-1}T)^T}{e^T(WRW)^{-1}e}\] where \(W\) is a diagonal matrix of weights for the individual test statistics (either the inverse of the variance or simply the sample size).
If there is heterogeneity between trait measures, \(S_{Hom}\) is not appropriate. The ideal test statistic in this case would only include the cohorts and traits with a true contribution to the association of a genetic variant. To implement this, the value \(\tau\) is used as a threshold, below which traits are not included in the test statistic. This statistic, \(S_{Het}\) can be viewed as the maximum of the weighted sum of trait-specific test statistics satisfying different thresholds. To calculate \(S_{Het}\) first find,
\[S_{\tau} = \frac{e^T(R(\tau)W(\tau))^{-1}T(\tau)(e^T(R(\tau)W(\tau))^{-1}T(\tau))^T}{e^TW(\tau)^{-1}R(\tau)^{-1}W(\tau)^{-1}e}\] When \(\tau\) is large, \(S(\tau)\) can be undefined if the test statistic falls below \(\tau\) for all cohorts. In this case \(S(\tau) = 0\). Our test statistic is then
Anyway, I’ve only included that for the mathematically inclined. Code to implement both statistics in R can be downloaded here, and is implemented below.
Note that the MASS package is required to run the functions we load below. Unfortunately this clashes with dplyrsselect(). So be prepared to run detach("package:MASS", unload=TRUE) to get some things to work if you’re moving back and forth throughout the code.
Code
source("FunctionSet.R") # saved as a separate file in the project.
Attaching package: 'MASS'
The following object is masked from 'package:patchwork':
area
The following object is masked from 'package:dplyr':
select
Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':
expand, pack, unpack
Loading required package: compiler
e0, sexes combined
The purpose of this meta-analysis is to search for SNPs that have some effect on ageing, expressed in many different contexts (sexes, temperatures and mating status’).
To conduct CPASSOC for a given SNP, we need a summary statistic from each GWAS. A different number of lines were included in each GWAS, which can cause small differences in the number of SNPs assessed. We therefore prune the list of SNPs to those included in all univariate analyses. After pruning (and previous pruning to remove SNPs in linkage disequilibrium), 220,582 SNPs remain.
Code
# load GWAS resultsArya_f_l_GWAS <-read_tsv("data/Derived/GWAS_results/Arya_f_l.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_1 = T)Arya_m_l_GWAS <-read_tsv("data/Derived/GWAS_results/Arya_m_l.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_2 = T)Huang_f_18_l_GWAS <-read_tsv("data/Derived/GWAS_results/Huang_f_18_l.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_3 = T)Huang_m_18_l_GWAS <-read_tsv("data/Derived/GWAS_results/Huang_m_18_l.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_4 = T)Huang_f_25_l_GWAS <-read_tsv("data/Derived/GWAS_results/Huang_f_25_l.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_5 = T)Huang_m_25_l_GWAS <-read_tsv("data/Derived/GWAS_results/Huang_m_25_l.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_6 = T)Huang_f_28_l_GWAS <-read_tsv("data/Derived/GWAS_results/Huang_f_28_l.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_7 = T)Huang_m_28_l_GWAS <-read_tsv("data/Derived/GWAS_results/Huang_m_28_l.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_8 = T)Wilson_f_l_GWAS <-read_tsv("data/Derived/GWAS_results/Wilson_f_l.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_9 = T)Durham_f_l_GWAS <-read_tsv("data/Derived/GWAS_results/Durham_f_l.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_10 = T)Patel_f_l_GWAS <-read_tsv("data/Derived/GWAS_results/Patel_f_l.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_11 = T)all_e0_effect_sizes <- Arya_f_l_GWAS %>%inner_join(Arya_m_l_GWAS, by ="SNP") %>%inner_join(Huang_f_18_l_GWAS, by ="SNP") %>%inner_join(Huang_m_18_l_GWAS, by ="SNP") %>%inner_join(Huang_f_25_l_GWAS, by ="SNP") %>%inner_join(Huang_m_25_l_GWAS, by ="SNP") %>%inner_join(Huang_f_28_l_GWAS, by ="SNP") %>%inner_join(Huang_m_28_l_GWAS, by ="SNP") %>%inner_join(Wilson_f_l_GWAS, by ="SNP") %>%inner_join(Durham_f_l_GWAS, by ="SNP") %>%inner_join(Patel_f_l_GWAS, by ="SNP") all_e0_effect_sizes_values <- all_e0_effect_sizes %>% dplyr::select(2:12)Sample_size_all <-c(165, 165, 183, 183, 186, 186, 177, 177, 161, 189, 193) S_hom <-SHom(all_e0_effect_sizes_values, Sample_size_all, all_e0_corr_matrix)# calculate meta-p-values and bind the two together with the SNP namesp_S_hom <-pchisq(S_hom, df =1, ncp =0, lower.tail = F) %>%as_tibble() %>%bind_cols(S_hom) %>%rename(meta_p_hom = value, S_hom = ...2)# Calculate S_het, an extension of S_hom that improves power when the genetic effect sizes vary for different traits (probably not such a problem here given all traits we investigate are some version of life expectancy)# estimate parameters of gamma distributionpara <-EstimateGamma(N =1E4, Sample_size_all, all_e0_corr_matrix);S_het <-SHet(all_e0_effect_sizes_values, Sample_size_all, all_e0_corr_matrix)# obtain P-values of S_Het using the estimated gamma parametersp_S_het <-pgamma(q = S_het-para[3], shape = para[1], scale = para[2], lower.tail = F) %>%as_tibble() %>%bind_cols(S_het) %>%rename(meta_p_het = value, S_het = ...2)all_e0_meta_results <- all_e0_effect_sizes %>%bind_cols(p_S_hom, p_S_het) # add the unadjusted p valuesall_e0_meta_results <-read_csv("data/Derived/all_e0_meta_results.csv")#write_csv(all_e0_meta_results, "data/Derived/all_e0_meta_results.csv")
h sexes combined
Code
# load GWAS resultsArya_f_h_GWAS <-read_tsv("data/Derived/GWAS_results/Arya_f_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_1 = T)Arya_m_h_GWAS <-read_tsv("data/Derived/GWAS_results/Arya_m_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_2 = T)Huang_f_18_h_GWAS <-read_tsv("data/Derived/GWAS_results/Huang_f_18_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_3 = T)Huang_m_18_h_GWAS <-read_tsv("data/Derived/GWAS_results/Huang_m_18_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_4 = T)Huang_f_25_h_GWAS <-read_tsv("data/Derived/GWAS_results/Huang_f_25_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_5 = T)Huang_m_25_h_GWAS <-read_tsv("data/Derived/GWAS_results/Huang_m_25_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_6 = T)Huang_f_28_h_GWAS <-read_tsv("data/Derived/GWAS_results/Huang_f_28_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_7 = T)Huang_m_28_h_GWAS <-read_tsv("data/Derived/GWAS_results/Huang_m_28_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_8 = T)Wilson_f_h_GWAS <-read_tsv("data/Derived/GWAS_results/Wilson_f_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_9 = T)Durham_f_h_GWAS <-read_tsv("data/Derived/GWAS_results/Durham_f_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_10 = T)Patel_f_h_GWAS <-read_tsv("data/Derived/GWAS_results/Patel_f_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_11 = T)all_h_effect_sizes <-Arya_f_h_GWAS %>%inner_join(Arya_m_h_GWAS, by ="SNP") %>%inner_join(Huang_f_18_h_GWAS, by ="SNP") %>%inner_join(Huang_m_18_h_GWAS, by ="SNP") %>%inner_join(Huang_f_25_h_GWAS, by ="SNP") %>%inner_join(Huang_m_25_h_GWAS, by ="SNP") %>%inner_join(Huang_f_28_h_GWAS, by ="SNP") %>%inner_join(Huang_m_28_h_GWAS, by ="SNP") %>%inner_join(Wilson_f_h_GWAS, by ="SNP") %>%inner_join(Durham_f_h_GWAS, by ="SNP") %>%inner_join(Patel_f_h_GWAS, by ="SNP") all_h_effect_sizes_values <- all_h_effect_sizes %>% dplyr::select(2:12)S_hom <-SHom(all_h_effect_sizes_values, Sample_size_all, all_h_corr_matrix)# calculate meta-p-values and bind the two together with the SNP namesp_S_hom <-pchisq(S_hom, df =1, ncp =0, lower.tail = F) %>%as_tibble() %>%bind_cols(S_hom) %>%rename(meta_p_hom = value, S_hom = ...2)# Calculate S_het, an extension of S_hom that improves power when the genetic effect sizes vary for different traits (probably not such a problem here given all traits we investigate are some version of life expectancy)# estimate parameters of gamma distributionpara <-EstimateGamma(N =1E4, Sample_size_all, all_h_corr_matrix);S_het <-SHet(all_h_effect_sizes_values, Sample_size_all, all_h_corr_matrix)# obtain P-values of S_Het using the estimated gamma parametersp_S_het <-pgamma(q = S_het-para[3], shape = para[1], scale = para[2], lower.tail = F) %>%as_tibble() %>%bind_cols(S_het) %>%rename(meta_p_het = value, S_het = ...2)all_h_meta_results <- all_e0_effect_sizes %>%bind_cols(p_S_hom, p_S_het) # add the unadjusted p values#write_csv(all_h_meta_results, "data/Derived/all_h_meta_results.csv")all_h_meta_results <-read_csv("data/Derived/all_h_meta_results.csv")
# Combine the GWAS results from each study in a wide format. inner_join allows us to limit the matrix to SNPs evaluated in all studies (not adhering to this will break CPASSOC)Female_e0_effect_sizes <-Arya_f_l_GWAS %>%inner_join(Huang_f_18_l_GWAS, by ="SNP") %>%inner_join(Huang_f_25_l_GWAS, by ="SNP") %>%inner_join(Huang_f_28_l_GWAS, by ="SNP") %>%inner_join(Wilson_f_l_GWAS, by ="SNP") %>%inner_join(Durham_f_l_GWAS, by ="SNP") %>%inner_join(Patel_f_l_GWAS, by ="SNP") Female_e0_effect_sizes_values <- Female_e0_effect_sizes %>% dplyr::select(2:8)# use the number of lines included as the weights input for now. 1/SE would be better and I think it should be possibleSample_size <-c(165, 183, 186, 177, 161, 189, 193) # first calculate S_hom, which is very similar to typical fixed effect meta-analysis (and probably what we'll use in the paper)S_hom <-SHom(Female_e0_effect_sizes_values, Sample_size, female_e0_corr_matrix)# calculate meta-p-values and bind the two together with the SNP namesp_S_hom <-pchisq(S_hom, df =1, ncp =0, lower.tail = F) %>%as_tibble() %>%bind_cols(S_hom) %>%rename(meta_p_hom = value, S_hom = ...2)# Calculate S_het, an extension of S_hom that improves power when the genetic effect sizes vary for different traits (probably not such a problem here given all traits we investigate are some version of life expectancy)# estimate parameters of gamma distributionpara <-EstimateGamma(N =1E4, Sample_size, female_e0_corr_matrix);S_het <-SHet(Female_e0_effect_sizes_values, Sample_size, female_e0_corr_matrix)# obtain P-values of S_Het using the estimated gamma parametersp_S_het <-pgamma(q = S_het-para[3], shape = para[1], scale = para[2], lower.tail = F) %>%as_tibble() %>%bind_cols(S_het) %>%rename(meta_p_het = value, S_het = ...2)e0_female_meta_results <- Female_e0_effect_sizes %>%bind_cols(p_S_hom, p_S_het) # add the unadjusted p values
Female lifespan inequality
Code
Arya_f_h_GWAS <-read_tsv("data/Derived/GWAS_results/Arya_f_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_1 = T)Huang_f_18_h_GWAS <-read_tsv("data/Derived/GWAS_results/Huang_f_18_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_2 = T)Huang_f_25_h_GWAS <-read_tsv("data/Derived/GWAS_results/Huang_f_25_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_3 = T)Huang_f_28_h_GWAS <-read_tsv("data/Derived/GWAS_results/Huang_f_28_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_4 = T)Wilson_f_h_GWAS <-read_tsv("data/Derived/GWAS_results/Wilson_f_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_5 = T)Durham_f_h_GWAS <-read_tsv("data/Derived/GWAS_results/Durham_f_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_6 = T)Patel_f_h_GWAS <-read_tsv("data/Derived/GWAS_results/Patel_f_h.tsv.gz") %>% dplyr::select(SNP, T) %>%rename(Trait_7 = T)Female_h_effect_sizes <-Arya_f_h_GWAS %>%inner_join(Huang_f_18_h_GWAS, by ="SNP") %>%inner_join(Huang_f_25_h_GWAS, by ="SNP") %>%inner_join(Huang_f_28_h_GWAS, by ="SNP") %>%inner_join(Wilson_f_h_GWAS, by ="SNP") %>%inner_join(Durham_f_h_GWAS, by ="SNP") %>%inner_join(Patel_f_h_GWAS, by ="SNP") Female_h_effect_sizes_values <- Female_h_effect_sizes %>% dplyr::select(2:8)# use the number of lines included as the weights input for now. 1/SE would be better and I think it should be possibleSample_size <-c(165, 183, 186, 177, 161, 189, 193) S_hom <-SHom(Female_h_effect_sizes_values, Sample_size, female_h_corr_matrix)# calculate meta-p-values and bind the two together with the SNP namesp_S_hom <-pchisq(S_hom, df =1, ncp =0, lower.tail = F) %>%as_tibble() %>%bind_cols(S_hom) %>%rename(meta_p_hom = value, S_hom = ...2)# Calculate S_het, an extension of S_hom that improves power when the genetic effect sizes vary for different traits (probably not such a problem here given all traits we investigate are some version of life expectancy)# estimate parameters of gamma distributionpara <-EstimateGamma(N =1E4, Sample_size, female_h_corr_matrix);S_het <-SHet(Female_h_effect_sizes_values, Sample_size, female_h_corr_matrix)# obtain P-values of S_Het using the estimated gamma parametersp_S_het <-pgamma(q = S_het-para[3], shape = para[1], scale = para[2], lower.tail = F) %>%as_tibble() %>%bind_cols(S_het) %>%rename(meta_p_het = value, S_het = ...2)h_female_meta_results <- Female_h_effect_sizes %>%bind_cols(p_S_hom, p_S_het)
Estimating the genetic correlation between mean lifespan and lifespan inequality
XX
Effect of mutational burden on lifespan
In this analysis we identify mutations that are likely to be deleterious, count the number of these putatively deleterious mutations per DGRP line, and test whether there is a negative association between mutational burden and lifespan.
The unguarded X
The homozygous nature of the DGRP makes this difficult.
But we can test if the X is purged of male-harming mutations more-so than female-harming mutations - that is, test the ‘faster X’ hypothesis.
Data preparation
We classified mutations as putatively deleterious if:
They are rare in the DGRP (<5% minor allele frequency)
They occur in a coding region
They have an effect that is likely to incur some functional change.
Following Holman and Wong (2023), we use the SnpEff designation of variants with major effects as those likely to incur a functional change. These variants are:
EXON_DELETED
NON_SYNONYMOUS_CODING
FRAME_SHIFT
CODON_CHANGE
CODON_INSERTION
CODON_CHANGE_PLUS_CODON_INSERTION
CODON_DELETION
CODON_CHANGE_PLUS_CODON_DELETION
STOP_GAINED
STOP_LOST
RARE_AMINO_ACID
START_LOST
START_GAINED
Code
# Load the DGRP genotype data using bigsnprif(!file.exists("snp_backing_file.bk")) {rds <-snp_readBed("data/Input/dgrp2.bed", backingfile ="snp_backing_file")}bed_file <-snp_attach("snp_backing_file.rds")annotations <-read.table("data/Input/dgrp.fb557.annot.txt", header =FALSE, stringsAsFactors =FALSE)# Use PLINK to get the allele frequencies across the 205 DGRP linesrun_command("--bfile dgrp2 --freq", path ="/data/Input")major_effect_types <-c("EXON_DELETED", "NON_SYNONYMOUS_CODING","FRAME_SHIFT", "CODON_CHANGE","CODON_INSERTION","CODON_CHANGE_PLUS_CODON_INSERTION","CODON_DELETION","CODON_CHANGE_PLUS_CODON_DELETION","STOP_GAINED","STOP_LOST","RARE_AMINO_ACID","START_LOST","START_GAINED")# this chunk detects and filters for any of the above variant types in the annotation datasetall_major_mutations <- annotations$V1[unique(c(which(str_detect(annotations$V3, major_effect_types[1])),which(str_detect(annotations$V3, major_effect_types[2])),which(str_detect(annotations$V3, major_effect_types[3])),which(str_detect(annotations$V3, major_effect_types[4])),which(str_detect(annotations$V3, major_effect_types[5])),which(str_detect(annotations$V3, major_effect_types[6])),which(str_detect(annotations$V3, major_effect_types[7])),which(str_detect(annotations$V3, major_effect_types[8])),which(str_detect(annotations$V3, major_effect_types[9])),which(str_detect(annotations$V3, major_effect_types[10])),which(str_detect(annotations$V3, major_effect_types[11])),which(str_detect(annotations$V3, major_effect_types[12])),which(str_detect(annotations$V3, major_effect_types[13]))))]# now filter down to those with 0 < MAF < 0.05rare_alleles <-read.table("data/Input/plink.frq", header = T) %>%filter(MAF <0.05& MAF >0) %>%pull(SNP)# find alleles on the XX_linked_alleles <-read.table("data/Input/plink.frq", header = T) %>%filter(CHR ==5) %>%pull(SNP)# and those on autosomesautosomal_alleles <-read.table("data/Input/plink.frq", header = T) %>%filter(CHR !=5) %>%pull(SNP)# Get the indexes of variants that are major mutations and also have MAF < 0.05indexes <-intersect(which(bed_file$map$marker.ID %in% all_major_mutations), which(bed_file$map$marker.ID %in% rare_alleles))# Get the indexes of variants that are major mutations and also have MAF < 0.05 and that are on the Xindexes_X <-intersect( indexes,which(bed_file$map$marker.ID %in% X_linked_alleles))indexes_a <-intersect( indexes,which(bed_file$map$marker.ID %in% autosomal_alleles))# Function to count the number of mutations in all 205 DGRP linescount_mutations <-function(indexes){tibble(line = bed_file$fam$family.ID,mutation_count =sapply(1:205, function(i) sum(bed_file$genotypes[i, ][indexes] ==2, na.rm = T)) ) %>%mutate(line =str_remove(line, "_"))}mutational_burden <-count_mutations(indexes)X_burden <-count_mutations(indexes_X)autosomal_burden <-count_mutations(indexes_a)# Integrate mutational burden with line lifespan data mutational_burden_line_data <-left_join(Arya_female_lifespan %>%rename(Female_lifespan = Lifespan), Arya_male_lifespan %>%rename(Male_lifespan = Lifespan)) %>%left_join(mutational_burden,by ="line")
Run the bayesian linear regression
Code
mutational_burden_line_data <- mutational_burden_line_data %>%mutate(Female_lifespan_standard = (Female_lifespan -mean(Female_lifespan))/sd(Female_lifespan),Male_lifespan_standard = (Male_lifespan -mean(Male_lifespan))/sd(Male_lifespan),mutation_count_100 = mutation_count/100)# run this on individual dataindividual_level_data <- Arya_raw_data %>%select(line, Lifespan, Sex) %>%mutate(line =as.character(line),line =paste("line", line, sep ="")) %>%group_by(Sex) %>%mutate(row =row_number()) %>%pivot_wider(names_from = Sex, values_from = Lifespan) %>%select(-row) %>%right_join(mutational_burden, by ="line") %>%rename(Female_lifespan = F, Male_lifespan = M) %>%mutate(Female_lifespan_standard = (Female_lifespan -mean(Female_lifespan, na.rm = T))/sd(Female_lifespan, na.rm = T),Male_lifespan_standard = (Male_lifespan -mean(Male_lifespan, na.rm = T))/sd(Male_lifespan, na.rm = T),mutation_count_100 = mutation_count/100)# raw dataindividual_multivariate_mutational_burden_model <-brm(bf(mvbind(Female_lifespan, Male_lifespan) ~ mutation_count_100 + (1|line)) +set_rescor(TRUE), data = individual_level_data,family = gaussian,prior =c(prior(normal(60, 20), class = Intercept, resp = Femalelifespan),prior(normal(60, 20), class = Intercept, resp = Malelifespan),prior(normal(0, 2), class = b, resp = Femalelifespan),prior(normal(0, 2), class = b, resp = Malelifespan),prior(normal(1, 1), class = sigma, resp = Femalelifespan),prior(normal(1, 1), class = sigma, resp = Malelifespan),prior(lkj(2), class = rescor)),warmup =2000, iter =6000, chains =4, cores =4)# meansmultivariate_mutational_burden_model <-brm(bf(mvbind(Female_lifespan, Male_lifespan) ~ mutation_count_100) +set_rescor(TRUE), data = mutational_burden_line_data,family = gaussian,prior =c(prior(normal(60, 20), class = Intercept, resp = Femalelifespan),prior(normal(60, 20), class = Intercept, resp = Malelifespan),prior(normal(0, 2), class = b, resp = Femalelifespan),prior(normal(0, 2), class = b, resp = Malelifespan),prior(normal(1, 1), class = sigma, resp = Femalelifespan),prior(normal(1, 1), class = sigma, resp = Malelifespan),prior(lkj(2), class = rescor)),warmup =2000, iter =6000, chains =4, cores =4)multivariate_mutational_burden_model